# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

# Read the raw application records; the file has no header row, so pandas
# assigns integer column labels
df = pd.read_csv("cc_approvals.data", header=None)
df.head()

# Count, per column, the entries pandas already recognizes as missing
df.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

# Missing entries may also be stored as a literal "?" string, which the
# null check above does not detect; count those per column
df.eq('?').sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13     0
dtype: int64

# Turn every "?" placeholder into a real missing value (np.nan)
df = df.replace(to_replace='?', value=np.nan)

# Recount missing values per column now that the placeholders are converted
df.isnull().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13     0
dtype: int64

# Impute missing values column by column: the mean for numeric columns and the
# most frequent value for categorical columns.
#
# A numeric column that contained "?" placeholders is read from the file as
# strings and stays non-numeric after the placeholders are replaced with NaN.
# Testing only the dtype would treat such a column as categorical (mode
# imputation, and later one dummy column per distinct value), so columns whose
# non-missing values all parse as numbers are converted to numeric first.
for col in df.columns:
    column = df[col]

    if not pd.api.types.is_numeric_dtype(column):
        try:
            # Raises if any non-missing value is not a number
            column = pd.to_numeric(column)
        except (ValueError, TypeError):
            # Genuinely categorical: impute with the most frequent value
            df[col] = column.fillna(column.value_counts().index[0])
            continue

    # Numeric (originally or after conversion): impute with the column mean
    df[col] = column.fillna(column.mean())

# One-hot encode the categorical columns, dropping the first level of each
df_encoded = pd.get_dummies(data=df, drop_first=True)

# Features are every encoded column except the last; the last column is the
# target and is kept two-dimensional (one column) by the list indexer
X = df_encoded.iloc[:, :-1].to_numpy()
y = df_encoded.iloc[:, [-1]].to_numpy()

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same
# transformation to both the training and the test features
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a logistic regression classifier with default settings and train it
# on the scaled training features (labels flattened to one dimension)
logreg = LogisticRegression()
logreg.fit(X_train_scaled, np.ravel(y_train))

# Predict labels for the same rows the model was trained on
y_train_pred = logreg.predict(X_train_scaled)

# Print the confusion matrix for those training-set predictions
print(confusion_matrix(y_true=y_train, y_pred=y_train_pred))

[[203   1]
 [  1 257]]

# Candidate values for the solver tolerance and the iteration cap
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Map each hyperparameter name to its list of candidates
param_grid = {"tol": tol, "max_iter": max_iter}

# Search every tol/max_iter combination with 5-fold cross-validation
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Run the search on the scaled training data (labels flattened to one dimension)
grid_model_result = grid_model.fit(X_train_scaled, np.ravel(y_train))

# Report the best cross-validated score and the parameters that produced it
best_train_score = grid_model_result.best_score_
best_train_params = grid_model_result.best_params_
print("Best: %f using %s" % (best_train_score, best_train_params))

Best: 0.820313 using {'max_iter': 100, 'tol': 0.0001}

# Take the best estimator found by the grid search
best_model = grid_model_result.best_estimator_

# Score it on the held-out, scaled test features
best_score = best_model.score(X_test_scaled, y_test)

print("Accuracy of logistic regression classifier: ", best_score)

Accuracy of logistic regression classifier:  0.8070175438596491

	0	1	2	3	4	5	6	7	8	9	10	11	12	13
0	b	30.83	0.000	u	g	w	v	1.25	t	t	1	g	0	+
1	a	58.67	4.460	u	g	q	h	3.04	t	t	6	g	560	+
2	a	24.50	0.500	u	g	q	h	1.50	t	f	0	g	824	+
3	b	27.83	1.540	u	g	w	v	3.75	t	t	5	g	3	+
4	b	20.17	5.625	u	g	w	v	1.71	t	f	0	s	0	+

Project Description

The Data

Goal

Our task is to use supervised learning techniques to automate the credit card approval process for banks.

Project Description

The Data

Goal

Our task is to use supervised learning techniques to automate the credit card approval process for banks.

The test accuracy (80.7%) is good but can be improved with some refinements. We can continue trying other models and tuning their hyperparameters until we reach the desired results. Thank you!